{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入常规的数据分析库\n",
    "from pyecharts import Bar,Pie\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import  matplotlib.pyplot as  plt\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>comment</th>\n",
       "      <th>star</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>还行吧，建议不要排队那个烤鸭和羊肉串，因为烤肉时间本来就不够，排那个要半小时，然后再回来吃烤...</td>\n",
       "      <td>40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>去过好几次了 东西还是老样子 没增添什么新花样 环境倒是挺不错 离我们这也挺近 味道还可以 ...</td>\n",
       "      <td>40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>一个字：好！！！ #羊肉串# #五花肉# #牛舌# #很好吃# #鸡软骨# #拌菜# #抄河...</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>第一次来吃，之前看过好多推荐说这个好吃，真的抱了好大希望，排队的人挺多的，想吃得趁早来啊。还...</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>羊肉串真的不太好吃，那种说膻不膻说臭不臭的味。烤鸭还行，大虾没少吃，也就到那吃大虾了，吃完了...</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             comment  star\n",
       "0  还行吧，建议不要排队那个烤鸭和羊肉串，因为烤肉时间本来就不够，排那个要半小时，然后再回来吃烤...    40\n",
       "1  去过好几次了 东西还是老样子 没增添什么新花样 环境倒是挺不错 离我们这也挺近 味道还可以 ...    40\n",
       "2  一个字：好！！！ #羊肉串# #五花肉# #牛舌# #很好吃# #鸡软骨# #拌菜# #抄河...    50\n",
       "3  第一次来吃，之前看过好多推荐说这个好吃，真的抱了好大希望，排队的人挺多的，想吃得趁早来啊。还...    20\n",
       "4  羊肉串真的不太好吃，那种说膻不膻说臭不臭的味。烤鸭还行，大虾没少吃，也就到那吃大虾了，吃完了...    30"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 读取评论文件，取其评论列与平分列\n",
    "df=pd.read_excel(\"all_data_meituan.xlsx\")[[\"comment\",\"star\"]]\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(17400, 2)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看数据的大小\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 以30分为划分界限，大于30分的记为正类1，其余的为负类0\n",
    "df['sentiment']=df['star'].apply(lambda x:1 if x>30 else 0)\n",
    "df=df.drop_duplicates() ## 去掉重复的评论 \n",
    "df=df.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3138, 1)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 复制数据可以使用np.tile(arr, (3, 1))，延第一个轴的方向进行数据复制3倍。\n",
    "# 如果数据不复制的话，只有1100条左右的信息，数据量太小，已使用多种算法进行拟合，效果均相差较大\n",
    "X=pd.concat([df[['comment']],df[['comment']],df[['comment']]])\n",
    "y=pd.concat([df.sentiment,df.sentiment,df.sentiment])\n",
    "X.columns=['comment']\n",
    "X.reset_index\n",
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "DEBUG:jieba:Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\HUANG_~1\\AppData\\Local\\Temp\\jieba.cache\n",
      "DEBUG:jieba:Loading model from cache C:\\Users\\HUANG_~1\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 0.873 seconds.\n",
      "DEBUG:jieba:Loading model cost 0.873 seconds.\n",
      "Prefix dict has been built succesfully.\n",
      "DEBUG:jieba:Prefix dict has been built succesfully.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0    还行 吧 ， 建议 不要 排队 那个 烤鸭 和 羊肉串 ， 因为 烤肉 时间 本来 就 不够...\n",
       "1    去过 好 几次 了   东西 还是 老 样子   没 增添 什么 新花样   环境 倒 是 ...\n",
       "2    一个 字 ： 好 ！ ！ ！   # 羊肉串 #   # 五花肉 #   # 牛舌 #   ...\n",
       "3    第一次 来 吃 ， 之前 看过 好多 推荐 说 这个 好吃 ， 真的 抱 了 好 大 希望 ...\n",
       "4    羊肉串 真的 不太 好吃 ， 那种 说 膻 不 膻 说 臭 不 臭 的 味 。 烤鸭 还 行...\n",
       "Name: cut_comment, dtype: object"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import jieba\n",
    "def chinese_word_cut(mytext):\n",
    "    return \" \".join(jieba.cut(mytext))\n",
    "X['cut_comment']=X[\"comment\"].apply(chinese_word_cut)\n",
    "X['cut_comment'].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import  train_test_split\n",
    "X_train,X_test,y_train,y_test= train_test_split(X,y,random_state=42,test_size=0.25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_custom_stopwords(stop_words_file):\n",
    "    with open(stop_words_file,encoding=\"utf-8\") as f:\n",
    "        custom_stopwords_list=[i.strip() for i in f.readlines()]\n",
    "    return custom_stopwords_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['100', '01', '02', '03', '04', '05', '06', '07', '08', '09']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stop_words_file = \"stopwords.txt\"\n",
    "stopwords = get_custom_stopwords(stop_words_file)\n",
    "stopwords[-10:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "        lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
       "        ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
       "        strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
       "        tokenizer=None, vocabulary=None)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import  CountVectorizer\n",
    "vect=CountVectorizer()\n",
    "vect"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<2353x1965 sparse matrix of type '<class 'numpy.int64'>'\n",
       "\twith 20491 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vect.fit_transform(X_train[\"cut_comment\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2353, 1965)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vect.fit_transform(X_train[\"cut_comment\"]).toarray().shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pd.DataFrame(vect.fit_transform(X_train[\"cut_comment\"]).toarray(),columns=vect.get_feature_names()).iloc[:10,:22]\n",
    "# print(vect.get_feature_names())\n",
    "# #  数据维数1956，不算很大（未使用停用词）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>amazing</th>\n",
       "      <th>happy</th>\n",
       "      <th>ktv</th>\n",
       "      <th>pm2</th>\n",
       "      <th>一万个</th>\n",
       "      <th>一个多</th>\n",
       "      <th>一个月</th>\n",
       "      <th>一串</th>\n",
       "      <th>一人</th>\n",
       "      <th>一件</th>\n",
       "      <th>...</th>\n",
       "      <th>麻烦</th>\n",
       "      <th>麻酱</th>\n",
       "      <th>黄喉</th>\n",
       "      <th>黄桃</th>\n",
       "      <th>黄花鱼</th>\n",
       "      <th>黄金</th>\n",
       "      <th>黑乎乎</th>\n",
       "      <th>黑椒</th>\n",
       "      <th>黑胡椒</th>\n",
       "      <th>齐全</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 1691 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   amazing  happy  ktv  pm2  一万个  一个多  一个月  一串  一人  一件 ...  麻烦  麻酱  黄喉  黄桃  \\\n",
       "0        0      0    0    0    0    0    0   0   0   0 ...   0   0   0   0   \n",
       "1        0      0    0    0    0    0    0   0   0   0 ...   0   0   0   0   \n",
       "2        0      0    0    0    0    0    0   0   0   0 ...   0   0   0   0   \n",
       "3        0      0    0    0    0    0    0   0   0   0 ...   0   0   0   0   \n",
       "4        0      0    0    0    0    0    0   0   0   0 ...   0   0   0   0   \n",
       "\n",
       "   黄花鱼  黄金  黑乎乎  黑椒  黑胡椒  齐全  \n",
       "0    0   0    0   0    0   0  \n",
       "1    0   0    0   0    0   0  \n",
       "2    0   0    0   0    0   0  \n",
       "3    0   0    0   0    0   0  \n",
       "4    0   0    0   0    0   0  \n",
       "\n",
       "[5 rows x 1691 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vect = CountVectorizer(token_pattern=u'(?u)\\\\b[^\\\\d\\\\W]\\\\w+\\\\b',stop_words=frozenset(stopwords)) # 去除停用词\n",
    "pd.DataFrame(vect.fit_transform(X_train['cut_comment']).toarray(), columns=vect.get_feature_names()).head()\n",
    "# 1691 columns,去掉以数字为特征值的列，减少了三列编程1691 \n",
    "# max_df = 0.8 # 在超过这一比例的文档中出现的关键词（过于平凡），去除掉。\n",
    "# min_df = 3 # 在低于这一数量的文档中出现的关键词（过于独特），去除掉。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('countvectorizer',\n",
       "  CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "          lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
       "          ngram_range=(1, 1), preprocessor=None,\n",
       "          stop_words=frozenset({'', '第二讲', '从优', '云云', '亲自', '今', '後面', '将近', '反过来说', 'ones', '兮', '］', '了', '第十八', 'off', '实现', '你们', '立地', 'him', '乘隙', '到处', 'wherever', '乃至于', '全年', 'where', '进行', '一片', '从未', '日见', '既', 'nearly', '每每', '顺', '取得', '逐渐', '吗', '反之则', 'maybe', '57', '进步', 'same', '://', '路经', ... 'sure', '按说', 'therefore', '方才', '里面', '为', '但是', 'ie', '没', '恰似', '总的来说', '元／吨', '第四套', 'course'}),\n",
       "          strip_accents=None, token_pattern='(?u)\\\\b[^\\\\d\\\\W]\\\\w+\\\\b',\n",
       "          tokenizer=None, vocabulary=None)),\n",
       " ('logisticregression',\n",
       "  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
       "            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n",
       "            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
       "            verbose=0, warm_start=False))]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "# lr=LogisticRegression(solver='saga',max_iter=10000)\n",
    "lr=LogisticRegression()\n",
    "pipe=make_pipeline(vect,lr)\n",
    "pipe.steps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(memory=None,\n",
       "     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "        lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
       "        ngram_range=(1, 1), preprocessor=None,\n",
       "        stop_words=...ty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
       "          verbose=0, warm_start=False))])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipe.fit(X_train.cut_comment, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.87261146496815289"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "y_pred = pipe.predict(X_test.cut_comment)\n",
    "metrics.accuracy_score(y_test,y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[210,  79],\n",
       "       [ 21, 475]], dtype=int64)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metrics.confusion_matrix(y_test,y_pred)\n",
    "#  准确率提高，同时降低了假阳性以及假阴性，saga的收敛的max_iter比默认的liblinear"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9177820267686424"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred_all = pipe.predict(X['cut_comment'])\n",
    "metrics.f1_score(y_true=y,y_pred=y_pred_all)\n",
    "metrics.accuracy_score(y,y_pred_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(memory=None,\n",
       "     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "        lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
       "        ngram_range=(1, 1), preprocessor=None,\n",
       "        stop_words=...andom_state=None, refit=True,\n",
       "           scoring='accuracy', solver='saga', tol=0.0001, verbose=0))])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegressionCV\n",
    "from sklearn.metrics import log_loss\n",
    "lrvc = LogisticRegressionCV(Cs=[0.0001,0.005,0.001,0.05,0.01,0.1,0.5,1,10],scoring='accuracy',solver='saga',max_iter=10000,penalty='l2',verbose=0)\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "# lr=LogisticRegression(solver='saga',max_iter =10000)\n",
    "pipe=make_pipeline(vect,lrvc)\n",
    "# pipe=make_pipeline(vect,lr)\n",
    "# pipe.steps\n",
    "pipe.fit(X_train.cut_comment, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.89936305732484079"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred = pipe.predict(X_test.cut_comment)\n",
    "metrics.accuracy_score(y_test,y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[235,  54],\n",
       "       [ 25, 471]], dtype=int64)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metrics.confusion_matrix(y_test,y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAALUAAAC3CAYAAABUmx+iAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAADzdJREFUeJzt3Xt0VOW5x/Hvk0RFIFEh4RKuQoEoKBoCAlZbQRSVFoOIgLZVoGCP5aaIZR2vqBWodIlS8YAHSwVEpRapVQriBUElIMWAwlEqWC4JdwVCrQGe88fsQIAw2STZsydvns9aWdmzs2fmt4YfO+/s2Xm3qCrGuCQh7ADGVDQrtXGOldo4x0ptnGOlNs6xUhvnWKmNc6zUgIi0EpFOInKGiCSGnSceVObXQar6hy8i0gv4LbDV+1oJ/FFV94UaLCQi0lJVv/CWE1X1cNiZTleV3lOLyBnALcBAVe0KvA40AkaLSEqo4UIgIj2A1SIyG0BVD1fGPXaVLrUnBWjhLf8FeAM4E+gvIhJaqhgTkRrAr4ERwPciMhMqZ7GrdKlVtRD4PdBLRK5Q1SPAUmA18MNQw8WYqhYAA4DZwCigWvFih5ntdFXpUns+ABYCPxORK1X1sKrOBtKBtuFGiy1V3aaqB1R1FzAEOLuo2CKSKSIZ4Sb0JynsAGFT1e9EZBagwBjvH+4/QF0gL9RwIVLV3SIyBPidiKwHEoGrQo7lS5UvNYCq7hWRacDnRPZQ3wG3qer2cJOFS1V3iUgucB3QTVW3hJ3Jjyp/SO9E3psi9cbXVZqInAe8Atyjqrlh5/HLSm2iEpFqqvpd2DlOh5XaOMeOfhjnWKmNc6zUxjlW6hKIyOCwM8STyvZ6WKlLVqn+EWOgUr0eVmrjnLg6pHde7VRt0LBx2DHYs2cXtWqlhh2Ds5LiY5+zc9dO0lLTQs2wZs2afd9//59z/GwbVx+TN2jYmFcXLgk7RtxoVrt62BHiRr06qTv8bhsfuwJjKpCV2jjHSm2cY6U2zrFSG+dYqY1zrNTGOVZq4xwrtXGOldo4x0ptnGOlNs6xUhvnWKmNc6zUxjlWauMcK7VxjpXaOMdKbZxjpTbOsVIb51ipjXOs1MY5VmrjHCu1cY6V2jjHSm2cY6U2zqlypc7buoXbe11Pjyva8ZMr2/PitGcBeHr8o9x4VUeyu3Zm0C092ZEfuS5ozrIP6NCiAdldO5PdtTPPThwXZvxAbd68mWuu7krbi1pzaduLmPz00wA8OvYRmjVpRId2mXRol8mCt94MOWl0gU7lKyLdgUlErpb6vKpGbUSbtpka9KynO7fns3N7PhdefAkFB/bT+5oreOaFOdRLT6dmcgoALz4/hX9+sZ6HJ0wiZ9kHvDBlElNmzg00V0liPetpXl4e+Xl5XJqZyf79++l0WXtenfsac+e+Ss2aNRl59z0xzVNcvTqpG/bu2dPCz7aBTeXrXWTzD0A3YAuwQkTmq+rnQT2nH2l165FWtx4ANWom06xFK3bkb+MHrY5ddvvfBwsQJKyIoalfvz7169cHIDk5mYyMDLZu2xpyqtMX5PCjA7BBVb9S1e+BOUDPAJ/vtG3919esW5vLxZlZADz1xCN0yczgjT+/wtDR/310u9Wf5JDdpROD+/Xiy/XrwoobU5s2bWL16tV06HAZAFOe/QNZl17C4EED2bt3b8jpoguy1A2AzcVub/HWHUdEBovIShFZuWfPrgDjHK+g4ADDB93GmLHjjg47Rox5iHdWrafHTX2YNX0qABde3Ja3V37OX975iFsHDmHoHf1iljEsBw4coF+fm3ly4u9JSUlh8JA7Wfd/X5LzySrq1a/PffeOCjtiVEGWuqTf3ycN4FV1qqpmqWpWrC5JUVhYyIiBt9GjVx+63XDyL48bsvuw6G+vA1AzOYUaNWoC8KOrr+VQYSF7d8fuP1+sFRYW0rdPb/r268+N2b0AqFu3LomJiSQkJDBg4CBWrlwRcsrogiz1FqBRsdsNgW0BPp8vqsoDI++iWYtW3H7n0KPrN3214ejyu39/k2Y/aAnAzh3bKXoznbtqJUf0COfWqh3b0DGiqgz55SAyMi5g+MiRR9fn5eUdXZ4/bx6tW7cOI55vQV7zZQXQQkTOB7YCfYH+AT6fL6tyPmL+3JdoeUFrsrt2BiLDjtde+hMbN3xJQkIC6Q0b8dCESQAs/Os85sx4nqSkJM6qVo2Jz72AiJtvIj9ctozZs2bSps1FdGiXCcDYxx7j5TlzyP30U0SEJk2bMPnZ50JOGl3Qh/SuB54ickhvuqo+Hm37WBzSq0zsQkbHxMUhPQBVfROI7yP1xjlV7hNF4z4rtXGOldo4x0ptnGOlNs6xUhvnWKmNc055nFpE9nPsXI2ij9DUW1ZVTQk4mzFlcspSq2pyLIMYU1F8DT9E5Icicoe3nOqdz2FMXCq11CLyEHAfMMZbdSYwM8hQxpSHnz11NvBToABAVbcBNjQxcctPqb/XyKl8CiAiNYKNZEz5+Cn1KyLyP8C5IvJL4G1gWrCxjCm7Uk89VdUnRaQbsA9oCTyoqosCT2ZMGfk9n3oNcDaRIcia4OIYU35+jn4MAnKAXkBv4GMRGRB0MGPKys+e+l7gUlXdDSAitYEPgelBBjOmrPy8UdwC7C92ez/Hz+dhTFyJdu7H3d7iVmC5iLxOZEzdk8hwxJi4FG34UfQByz+9ryKvBxfHmPKLdkLTI7EMYkxFKfWNooikAaOB1kC1ovWq2iXAXMaUmZ83irOA9cD5wCPAJiKzLxkTl/yUuraq/i9QqKrvq+oAoGPAuYwpMz/HqQu973kicgORSR4bBhfJmPLxU+rHROQc4B7gGSAFGBn9LsaEx88JTW94i98CVwUbx5jyi/bhyzOUMEl6EVUdVtFhqp2RQMu0mhX9sJXW35fauWNFvtl30Pe20fbUK8sfxZjYi/bhy4xYBjGmothkNsY5VmrjHCu1cY6fv3xpKSKLRWStd/tiEbk/+GjGlI2fPfU0IhPZFAKoai6RK20ZE5f8lLq6qp74RwGHgghjTEXwU+pdItKcY5PZ9Abyot/FmPD4OffjLmAqkCEiW4GNwG2BpjKmHPyc+/EVcLU33ViCqu4v7T7GhMnPX748eMJtAFR1bECZjCkXP8OPgmLL1YAewLpg4hhTfn6GHxOL3xaRJ4H5gSUyppzK8olidaBZRQcxpqL4GVOv4dh51YlAGmDjaRO3/IypexRbPgRsV1X78MXErailFpEE4G+q2iZGeYwpt6hjalU9AnwqIo1jlMeYcvMz/KgPfCYiORQ7vKeqPw0slTHl4KfUNqeeqVT8lPp6Vb2v+AoRGQ+8H0wkY8rHz3HqbiWsu66igxhTUaLN+/Er4L+AZiKSW+xHycCyoIMZU1bRhh+zgbeAJ4DfFFu/X1X3BJrKmHKINu/Ht0SmGusXuzjGlJ/9NblxjpXaOMdKbZzj9zLOzmrerCnJyckkJiaSlJTE8pxj82JOnPgk942+l/ztO0lNTQ0xZbAOHz7MiMH9qJ1Wh4fHTWb0r3/BwX9HZhn9du8eWl7Qhgcen8Tmrzfy1LgH2PDlOn4+aCg39b093OCnEFipRWQ6kTP8dsT7CVFvL373pNJu3ryZtxctonFj9097mT93Fo2anM/Bg5GzICZMPjY36OMPjKTj5ZFpyZNTUhgy7Dd8tPSdUHL6FeTw449A9wAfP1D33D2SceMnHP2bTFft2pHPio+XcG2PXif97ODBAj5dlUOnKyIXYjv3vNq0vKANSUnx/Qs+sFKr6hIg7o9niwjXdb+GDu3bMW3qVAD+On8+DRo0oG3btiGnC97UyRO44867iZxlfLyPlizmknaXUb1G5ZoIP/T/ciIyGBgMhPKrfskHy0hPT2fHjh10v7YbrTIy+O0Tj7NgwcKYZ4m1nA/f55xza9Gi1YXk/uPkqwi+v/itEvfg8S70ox+qOlVVs1Q1Ky0tLebPn56eDkCdOnXoeWM2S5a8z6aNG8m8tC3NmzVly5YttM/KJD8/P+bZgvb52tUs//A97rilO+PHjiZ3VQ6/e2wMAPu+/YYv1q+lfccrww1ZBqHvqcNUUFDAkSNHSE5OpqCggEWLFnL//Q+Sl7/j6DbNmzVlec5KJ49+3D54OLcPHg5A7j9W8NrLM7j3/icAWPreQjp0upIzzzorzIhlUqVLvX37dnrflA3AoUOH6NuvP927V9r3thVqyTsL6N1/wHHr9uzexYghfTlYUEBCQgKvz53JczPmxd2YW1RPeQGu8j2wyEvAj4FUYDvwkHfl3FPKysrS4seJqzq7OtcxN3Rpv0EPfdfCz7aB7alV1U6EMqEI/Y2iMRXNSm2cY6U2zrFSG+dYqY1zrNTGOVZq4xwrtXGOldo4x0ptnGOlNs6xUhvnWKmNc6zUxjlWauMcK7VxjpXaOMdKbZxjpTbOsVIb51ipjXOs1MY5VmrjHCu1cY6V2jjHSm2cY6U2zrFSG+cENutpWYjITuDrsHMQmal1V9gh4kg8vB5NVNXXrPxxVep4ISIrVTUr7BzxorK9Hjb8MM6xUhvnWKlLNjXsAHGmUr0eNqY2zrE9dQyJyAHve7qIzC1l2xEiUr2UbTaJSNTLhhU952lkfFhERp3OfeKNlbqcRCTxdO+jqttUtXcpm40AopbalMxKHYWINBWR9SIyQ0RyRWSuiFT39pAPishS4GYRaS4iC0TkExH5QEQyvPufLyIficgKEXn0hMdd6y0nisiTIrLGe46hIjIMSAfeFZF3fWad5z3/Z95VhIv/bKKIrBKRxSKS5q0rMbMTVNW+TvEFNAUUuNy7PR0YBWwCRhfbbjHQwlu+DHjHW54P/Nxbvgs4UOxx13rLvwL+DCR5t2t53zcBqaXkO7pNsfudDawFanu3FbjVW34QmFxK5oeBUWG/9uX5qtIXB/Vps6ou85ZnAsO85ZcBRKQm0Bl4VUSK7lN0mdjLgZu85ReB8SU8/tXAc6p6CEBV95Qx5zARyfaWGwEtgN3AkaKsXv7XSslc6VmpS3fi4aGi2wXe9wTgG1W9xOf9TyQ+ton+ACI/JvKfo5OqHhSR94BqUfKUlrlSszF16RqLSCdvuR+wtPgPVXUfsFFEbgaQiLbej5cBfb3lW0/x+AuBO0Ukybt/LW/9fiDZZ8ZzgL1eoTOAjsV+lgAUvSntDywtJXOlZ6Uu3TrgFyKSC9QCppSwza3AQBH5FPgM6OmtHw7cJSIriBSvJM8D/wJyvfv399ZPBd7y+UZxAZDkZXwU+LjYzwqA1iLyCdAFGFtK5krPPnyJQkSaAm+oapuQo5jTYHtq4xzbU1cCIrKck49O/ExV14SRJ95ZqY1zbPhhnGOlNs6xUhvnWKmNc/4fNZrnQkAXrFgAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 180x180 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "def get_confusion_matrix(conf,clas):\n",
    "    import  matplotlib.pyplot as  plt\n",
    "    fig,ax=plt.subplots(figsize=(2.5,2.5))\n",
    "    ax.matshow(conf,cmap=plt.cm.Blues,alpha=0.3)\n",
    "    tick_marks = np.arange(len(clas))\n",
    "    plt.xticks(tick_marks,clas, rotation=45)\n",
    "    plt.yticks(tick_marks, clas)\n",
    "    for i in range(conf.shape[0]):\n",
    "        for j in range(conf.shape[1]):\n",
    "            ax.text(x=i,y=j,s=conf[i,j],\n",
    "                   va='center',\n",
    "                   ha='center')\n",
    "    plt.xlabel(\"predict_label\")\n",
    "    plt.ylabel(\"true label\")\n",
    "    \n",
    "conf=metrics.confusion_matrix(y_test,y_pred)\n",
    "class_names=np.array(['0','1'])  # 处理二分类数据\n",
    "get_confusion_matrix(np.array(conf),clas=class_names)\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
